R Markdown

This script goes through demographics, cnb scores, health, and psych summaries, adds clustering information, runs statistics and makes graphs from results.

Part 1 : Read in csv.s -This script reads in demographics, clinical scores, health and psych summaries, merges them, removes NAs, codes and separates by depression.

Part 2 : merge with hydra -It then merges these documents with hydra output (made in cbica), adding Hydra_k1 through Hydra_k10 columns (which represent the number of clusters) -The script reads in 3 different types of groups (matched, unmatched, and residualized unmatched groups), and also does all gender together as well as separating them by gender.

Part 3 : LM -The script then runs LM on each clinical score (clinical_measure ~ hydra_group).
-There is a test option that does this for all clinical measures and all hydra groups, but for the remainder of the analysis, Hydra_k2 was the only classification more deeply explored.

Part 4 : Anova -Anovas were also run on the results of the LM of each clinical value by cluster.

Part 5 : Graphing - Graphs were then made.
For continuous variables(age, medu1), the graphs represent means, with SEM as error bars For categorical variables (race, sex) the graphs are percentages (caucasian, male) per group, with chisq used to calculate significance

Part 6 : FDR Correction -FDR correction was calculated for each clinical measure ANOVA output -A table of the results was extracted

Part 7 : Demographics tables - Demographics tables for each group (matched, unmatched, resid) were produced

#######################################################
############ READ IN, MERGE AND SUBSET DATA############
#######################################################

#read in csvs
#######################################################
############ READ IN, MERGE AND SUBSET DATA############
#######################################################

#read in csvs
demographics <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/n9498_demographics_go1_20161212.csv", header = TRUE, sep = ",") #from /data/joy/BBL/projects/ballerDepHeterogen/data/n9498_demographics_go1_20161212.csv
clinical_scores <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/n9498_goassess_itemwise_bifactor_scores_20161219.csv", header = TRUE, sep = ",") #from /data/joy/BBL/projects/ballerDepHeterogen/data/n9498_goassess_itemwise_bifactor_scores_20161219.csv
health <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/n9498_health_20170405.csv", header = TRUE, sep = ",") #from /data/joy/BBL/projects/ballerDepHeterogen/data/n9498_health_20170405.csv
psych_summary <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/n9498_goassess_psych_summary_vars_20131014.csv", header = TRUE, sep = ",") #from /data/joy/BBL/projects/ballerDepHeterogen/data/n9498_goassess_psych_summary_vars_20131014.csv

#read in Hydra cluster data -> AG (all_gender), M(males), F(females) 
hydra_AG_matched_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_all_gender_matched_n1424_HydraSubtypes.csv", header = TRUE, sep = ",")
hydra_AG_unmatched_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_all_gender_unmatched_n3022_HydraSubtypes.csv", header = TRUE, sep = ",")
hydra_AG_residuals_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_all_gender_unmatched_resid_n3022_HydraSubtypes.csv", header = TRUE, sep = ",") 

hydra_F_matched_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_females_matched_n712_HydraSubtypes.csv", header = TRUE, sep = ",")
hydra_F_unmatched_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_females_unmatched_n1588_HydraSubtypes.csv", header = TRUE, sep = ",")
hydra_F_residuals_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_females_unmatched_resid_n1588_HydraSubtypes.csv", header = TRUE, sep = ",") 

hydra_M_matched_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_males_matched_n712_HydraSubtypes.csv", header = TRUE, sep = ",")
hydra_M_unmatched_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_males_unmatched_n1434_HydraSubtypes.csv", header = TRUE, sep = ",")
hydra_M_residuals_clusters <- read.csv("/Users/eballer/BBL/from_chead/ballerDepHeterogen/hydra_output_from_cbica/CogData_males_unmatched_resid_n1434_HydraSubtypes.csv", header = TRUE, sep = ",") 

#Coerce hydra values into factor format
hydra_names <- names(hydra_AG_matched_clusters[2:11])
hydra_AG_matched_clusters[hydra_names] <- lapply(hydra_AG_matched_clusters[hydra_names], factor)
hydra_AG_unmatched_clusters[hydra_names] <- lapply(hydra_AG_unmatched_clusters[hydra_names], factor)
hydra_AG_residuals_clusters[hydra_names] <- lapply(hydra_AG_residuals_clusters[hydra_names], factor)

hydra_F_matched_clusters[hydra_names] <- lapply(hydra_F_matched_clusters[hydra_names], factor)
hydra_F_unmatched_clusters[hydra_names] <- lapply(hydra_F_unmatched_clusters[hydra_names], factor)
hydra_F_residuals_clusters[hydra_names] <- lapply(hydra_F_residuals_clusters[hydra_names], factor)

hydra_M_matched_clusters[hydra_names] <- lapply(hydra_M_matched_clusters[hydra_names], factor)
hydra_M_unmatched_clusters[hydra_names] <- lapply(hydra_M_unmatched_clusters[hydra_names], factor)
hydra_M_residuals_clusters[hydra_names] <- lapply(hydra_M_residuals_clusters[hydra_names], factor)


############
#prepping and merging big clinical and demographics stuff
############

#remove people with NA for race, age, or sex.  START WITH N = 9498
demographics_noNA_race <- demographics[!is.na(demographics$race),] #everyone has a race, N = 9498
demographics_noNA_race_age <- demographics_noNA_race[!is.na(demographics_noNA_race$ageAtClinicalAssess1),] # 86 people do not have age at clinical assessment.  N = 9412
demographics_noNA_race_age_sex <- demographics_noNA_race_age[!is.na(demographics_noNA_race_age$sex),] #everyone has a sex, N = 9412

#remove people with NA for depression or total psych score, START WITH N = 9498
psych_summary_no_NA_dep <- psych_summary[!is.na(psych_summary$smry_dep),] #take out those with NA for depression, 87 people N = 9411
psych_summary_no_NA_dep_and_smry_psych_overall <- psych_summary_no_NA_dep[!is.na(psych_summary_no_NA_dep$smry_psych_overall_rtg),] #take out those with NA for overall psych rtg, no additional people lost, N = 9411

#merge demographics and clinical #this is if we want to include people without full demographic data
dem_clinical <- merge(demographics_noNA_race_age_sex, clinical_scores, by = "bblid") #merge demographics and clinical, N = 9350
psych_health <- merge(psych_summary_no_NA_dep_and_smry_psych_overall, health, by = "bblid") #merge psych and health, N = 9411
dem_clinical_psych_health_merged <- merge (dem_clinical, psych_health, by = "bblid") #merge all 3 csvs, lost 1 person [134716] (had demographics, but no psych ratings): N = 9350

#make subsets
subset_just_dep_and_no_medicalratingExclude <- subset.data.frame(dem_clinical_psych_health_merged, (medicalratingExclude == 0) & (smry_dep == 4), select = "bblid") #subset people who were not medically excluded and who are depressed, N = 776
subset_no_psych_no_medicalratingExclude <- subset.data.frame(dem_clinical_psych_health_merged, (medicalratingExclude == 0) & (smry_psych_overall_rtg < 4), select = "bblid") #subset people who are psychiatrically healthy, N = 2508
subset_dep_or_no_psych_and_no_medicalratingExclude <- subset.data.frame(dem_clinical_psych_health_merged, (medicalratingExclude == 0) & ((smry_dep == 4) | (smry_psych_overall_rtg <4))) #subset including both depressed and healthies, good for regressions, N = 3284

#would binarize depression smry score to -1 (less than 4, not depressed) and 1 (score 4 , depressed)
dep_binarized <- ifelse(subset_dep_or_no_psych_and_no_medicalratingExclude$smry_dep == 4, 1, -1)
subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED <- cbind(subset_dep_or_no_psych_and_no_medicalratingExclude, dep_binarized) #N = 3284

#make depression and gender into factor scores
subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$dep_binarized <- as.factor(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$dep_binarized)
subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$sex <- as.factor(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$sex)

#divide ageAtclinical by 12 for age
subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$age_in_years <- subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$ageAtClinicalAssess1/12

#race binarized for plotting purposes, caucasian 1, non-caucasion 0
subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$race_binarized <- ifelse(as.factor(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED$race) == 1, 1, 0)

#remove people with NA in their cognitive measures
subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED <- subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED[complete.cases(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED[14:39]),]

################################################
### Merge clustering data with subsetted data ##
################################################

#AG (all gender), M (males), F (females) 
#AG Matched - n 1423
#AG Unmatched n = 3014
#AG Resid n = 3014
subset_with_clusters_AG_matched <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_AG_matched_clusters, by = "bblid")
subset_with_clusters_AG_unmatched <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_AG_unmatched_clusters, by = "bblid")
subset_with_clusters_AG_resid <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_AG_residuals_clusters, by = "bblid")

#F Matched - n 952
#F Unmatched n = 1584
#F Resid n = 1585
subset_with_clusters_F_matched <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_F_matched_clusters, by = "bblid")
subset_with_clusters_F_unmatched <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_F_unmatched_clusters, by = "bblid")
subset_with_clusters_F_resid <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_F_residuals_clusters, by = "bblid")

#M Matched - n 470
#M Unmatched n = 1430
#M Resid n = 1430
subset_with_clusters_M_matched <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_M_matched_clusters, by = "bblid")
subset_with_clusters_M_unmatched <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_M_unmatched_clusters, by = "bblid")
subset_with_clusters_M_resid <- merge(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED, hydra_M_residuals_clusters, by = "bblid")

#save objects
saveRDS(object = subset_with_clusters_AG_matched, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_AG_matched_clinical.rds")
saveRDS(object = subset_with_clusters_AG_unmatched, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_AG_unmatched_clinical.rds")
saveRDS(object = subset_with_clusters_AG_resid, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_AG_resid_clinical.rds")

saveRDS(object = subset_with_clusters_F_matched, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_F_matched_clinical.rds")
saveRDS(object = subset_with_clusters_F_unmatched, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_F_unmatched_clinical.rds")
saveRDS(object = subset_with_clusters_F_resid, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_F_resid_clinical.rds")

saveRDS(object = subset_with_clusters_M_matched, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_M_matched_clinical.rds")
saveRDS(object = subset_with_clusters_M_unmatched, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_M_unmatched_clinical.rds")
saveRDS(object = subset_with_clusters_M_resid, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/data/subset_with_clusters_M_resid_clinical.rds")

Including Graphs

#################################
# Linear Model for each measure #
##### Results stored in list ####
#################################
#get clinical measure names (grep factor)  
clinical_measure_names <- names(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED)[grep("factor", names(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED))] #get the names of all the columns with _z in the name
cluster_names <- colnames(hydra_AG_matched_clusters[,2:11])

clinical_measure_names_list <- names(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED)[grep("factor", names(subset_dep_or_no_psych_and_no_medicalratingExclude_DEPBINARIZED))] #get the names of all the columns with _z in the name
cluster_names_list <- colnames(hydra_AG_matched_clusters[,2:11])


#### All Hydra clusters in embedded list######
clinical_cog_score_cluster_stats_lm_AG_matched_by_cluster_1through10 <- lapply(cluster_names_list, function(cluster_name)
{
  clinical_cog_score_cluster_stats_lm_AG_matched_withincluster<- lapply(clinical_measure_names_list, cluster=as.name(cluster_name), function(clinical_measure_name, cluster) 
  {
    clinical_measure <- as.name(clinical_measure_name)
    lm(substitute(clinical_measure ~ cluster, list(clinical_measure = clinical_measure, cluster = cluster)), data = subset_with_clusters_AG_matched)
  })
  setNames(clinical_cog_score_cluster_stats_lm_AG_matched_withincluster, clinical_measure_names_list)
})
names(clinical_cog_score_cluster_stats_lm_AG_matched_by_cluster_1through10) <- cluster_names_list

##### Just Hydra_3 clusters ######
clinical_cog_score_cluster_stats_lm_AG_matched <- lapply(clinical_measure_names, function(x) 
{
  lm(substitute(i ~ Hydra_k2, list(i = as.name(x))), data = subset_with_clusters_AG_matched)
})

clinical_cog_score_cluster_stats_lm_AG_unmatched <- lapply(clinical_measure_names, function(x) 
{
  lm(substitute(i ~ Hydra_k2, list(i = as.name(x))), data = subset_with_clusters_AG_unmatched)
})

clinical_cog_score_cluster_stats_lm_AG_resid <- lapply(clinical_measure_names, function(x) 
{
  lm(substitute(i ~ Hydra_k2, list(i = as.name(x))), data = subset_with_clusters_AG_resid)
})


##############################
####### Statistics ###########
##############################

######## All hydra clusters, Anova results ########
clinical_cog_score_cluster_stats_anova_AG_matched_by_cluster_1through10 <- lapply(cluster_names_list, function(cluster_name)
{
  clinical_cog_score_cluster_stats_anova_AG_withincluster <- lapply(clinical_measure_names_list, cluster=as.name(cluster_name), function(clinical_measure_name, cluster) 
  {
    clinical_measure <- as.name(clinical_measure_name)
    anova(lm(substitute(clinical_measure ~ cluster, list(clinical_measure = clinical_measure, cluster = cluster)), data = subset_with_clusters_AG_matched))
  })
  setNames(clinical_cog_score_cluster_stats_anova_AG_withincluster, clinical_measure_names_list)
})
names(clinical_cog_score_cluster_stats_anova_AG_matched_by_cluster_1through10) <- cluster_names_list


#####Just Hydra 3 clusters ANOVA ########
clinical_cog_score_cluster_stats_anova_AG_matched <- lapply(clinical_measure_names, function(x) 
{
  anova(lm(substitute(i ~ Hydra_k2, list(i = as.name(x))), data = subset_with_clusters_AG_matched))
})

clinical_cog_score_cluster_stats_anova_AG_unmatched <- lapply(clinical_measure_names, function(x) 
{
  anova(lm(substitute(i ~ Hydra_k2, list(i = as.name(x))), data = subset_with_clusters_AG_unmatched))
})

clinical_cog_score_cluster_stats_anova_AG_resid <- lapply(clinical_measure_names, function(x) 
{
  anova(lm(substitute(i ~ Hydra_k2, list(i = as.name(x))), data = subset_with_clusters_AG_resid))
})

names(clinical_cog_score_cluster_stats_lm_AG_matched) <- clinical_measure_names
names(clinical_cog_score_cluster_stats_lm_AG_unmatched) <- clinical_measure_names
names(clinical_cog_score_cluster_stats_lm_AG_resid) <- clinical_measure_names

names(clinical_cog_score_cluster_stats_anova_AG_matched) <- clinical_measure_names
names(clinical_cog_score_cluster_stats_anova_AG_unmatched) <- clinical_measure_names
names(clinical_cog_score_cluster_stats_anova_AG_resid) <- clinical_measure_names

#checkmodel with visreg, uncomment when want to check
invisible(lapply(clinical_cog_score_cluster_stats_lm_AG_matched, function(x) {visreg(x)}))

invisible(lapply(clinical_cog_score_cluster_stats_lm_AG_unmatched, function(x) {visreg(x)}))

invisible(lapply(clinical_cog_score_cluster_stats_lm_AG_resid, function(x) {visreg(x)}))

#WILL HAVE TO DO THIS FOR MALES AND FEMALES#

###################################################
### Average Z-scores for accuracy and speed #######
###################################################

#Get stats of mean accuracy, processing speed and efficiency - TD in the middle, group 1 lower accuracy, processing speed and efficiency, 2 is high on all 3
df_mean_mood <- c(mean(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                       mean(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 1)]),
                       mean(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 2)]))

df_mean_psychosis <- c(mean(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                                mean(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 1)]),
                                mean(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 2)]))

df_mean_phobias <- c(mean(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                          mean(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 1)]),
                          mean(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 2)]))

df_mean_externalizing <- c(mean(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                           mean(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 1)]),
                           mean(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 2)]))

df_mean_overall <- c(mean(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                     mean(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 1)]),
                     mean(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == 2)]))

#error bars: will go through each type of symptom, get mean and standard deviation, then divide to get sem
clinical_data_mood_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), mean_mood = c(mean(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]),
                                                                                            mean(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                                            mean(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])),
                                                                              sd_mood = c(sd(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                                                                                          sd(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                                          sd(subset_with_clusters_AG_matched$mood_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])))

clinical_data_psychosis_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), mean_psychosis = c(mean(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]),
                                                                                            mean(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                                            mean(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])),
                                                    sd_psychosis = c(sd(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                                                    sd(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                    sd(subset_with_clusters_AG_matched$psychosis_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])))

clinical_data_phobias_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), mean_phobias = c(mean(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]),
                                                                                            mean(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                                            mean(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])),
                                                    sd_phobias = c(sd(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                                                    sd(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                    sd(subset_with_clusters_AG_matched$phobias_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])))

clinical_data_externalizing_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), mean_externalizing = c(mean(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]),
                                                                                            mean(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                                            mean(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])),
                                                    sd_externalizing = c(sd(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                                                    sd(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                    sd(subset_with_clusters_AG_matched$externalizing_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])))
clinical_data_overall_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), mean_overall = c(mean(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]),
                                                                                                              mean(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                                                              mean(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])),
                                                                      sd_overall = c(sd(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]),
                                                                      sd(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]),
                                                                      sd(subset_with_clusters_AG_matched$overall_psychopathology_4factorv2[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])))

clinical_data_mood_sd_sem$sem = clinical_data_mood_sd_sem$sd_mood/sqrt(nrow(subset_with_clusters_AG_matched))
clinical_data_psychosis_sd_sem$sem = clinical_data_psychosis_sd_sem$sd_psychosis/sqrt(nrow(subset_with_clusters_AG_matched))
clinical_data_phobias_sd_sem$sem = clinical_data_phobias_sd_sem$sd_phobias/sqrt(nrow(subset_with_clusters_AG_matched))
clinical_data_externalizing_sd_sem$sem = clinical_data_externalizing_sd_sem$sd_externalizing/sqrt(nrow(subset_with_clusters_AG_matched))
clinical_data_overall_sd_sem$sem = clinical_data_overall_sd_sem$sd_overall/sqrt(nrow(subset_with_clusters_AG_matched))

#make a data frame with all standard errors
all_sem <- data.frame(clinical_data_mood_sd_sem$sem, clinical_data_psychosis_sd_sem$sem, clinical_data_phobias_sd_sem$sem, clinical_data_externalizing_sd_sem$sem, clinical_data_mood_sd_sem$sem)
#make a single column of sem in the data frame mood (TD, Cluster1, Cluster 2), psychosis(TD, cluster 1, cluster 2), etc
all_sem_one_col <- data.frame(sem=unlist(all_sem, use.names = FALSE))

clinical_all_measures <- data.frame(cl=c("TD", "Cluster1", "Cluster2"), mood =c(df_mean_mood), psychosis = c(df_mean_psychosis), phobias = c(df_mean_phobias), extern = c(df_mean_externalizing), overall = c(df_mean_overall))
clinical_measures_for_plot <- melt(clinical_all_measures)
## Using cl as id variables
#add sems to clinical_measures_for_plot
clinical_measures_for_plot$sem <- all_sem_one_col
names(clinical_measures_for_plot) <- c("cluster", "clinical", "z_score", "sem")
ggplot(data = clinical_measures_for_plot, aes(x = clinical, y = z_score, group = cluster)) + 
  geom_line(aes(color=cluster)) +
  geom_point(aes(color=cluster)) + 
  geom_errorbar(aes(ymin=z_score-sem, ymax=z_score+sem), width=.1) +
  ggtitle("Hydra_k2 Clinical Measures") 

ggplot(clinical_all_measures, aes(x = cl, y = mood, fill=cl)) + geom_col() +
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + xlab("Clusters") + ylab("mood") + 
  ggtitle("Hydra_k2 mood") #+ scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +

  guides(fill=guide_legend(title=NULL))
## $fill
## $title
## NULL
## 
## $title.position
## NULL
## 
## $title.theme
## NULL
## 
## $title.hjust
## NULL
## 
## $title.vjust
## NULL
## 
## $label
## [1] TRUE
## 
## $label.position
## NULL
## 
## $label.theme
## NULL
## 
## $label.hjust
## NULL
## 
## $label.vjust
## NULL
## 
## $keywidth
## NULL
## 
## $keyheight
## NULL
## 
## $direction
## NULL
## 
## $override.aes
## list()
## 
## $nrow
## NULL
## 
## $ncol
## NULL
## 
## $byrow
## [1] FALSE
## 
## $reverse
## [1] FALSE
## 
## $order
## [1] 0
## 
## $available_aes
## [1] "any"
## 
## $name
## [1] "legend"
## 
## attr(,"class")
## [1] "guide"  "legend"
## 
## attr(,"class")
## [1] "guides"
ggplot(clinical_all_measures, aes(x = cl, y = psychosis, fill=cl)) + geom_col() +
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + xlab("Clusters") + ylab("psychosis") + 
  ggtitle("Hydra_k2 psychosis") #+ scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +

  guides(fill=guide_legend(title=NULL))
## $fill
## $title
## NULL
## 
## $title.position
## NULL
## 
## $title.theme
## NULL
## 
## $title.hjust
## NULL
## 
## $title.vjust
## NULL
## 
## $label
## [1] TRUE
## 
## $label.position
## NULL
## 
## $label.theme
## NULL
## 
## $label.hjust
## NULL
## 
## $label.vjust
## NULL
## 
## $keywidth
## NULL
## 
## $keyheight
## NULL
## 
## $direction
## NULL
## 
## $override.aes
## list()
## 
## $nrow
## NULL
## 
## $ncol
## NULL
## 
## $byrow
## [1] FALSE
## 
## $reverse
## [1] FALSE
## 
## $order
## [1] 0
## 
## $available_aes
## [1] "any"
## 
## $name
## [1] "legend"
## 
## attr(,"class")
## [1] "guide"  "legend"
## 
## attr(,"class")
## [1] "guides"
ggplot(clinical_all_measures, aes(x = cl, y = phobias, fill=cl)) + geom_col() +
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + xlab("Clusters") + ylab("phobias") + 
  ggtitle("Hydra_k2 phobias")# + scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +

  guides(fill=guide_legend(title=NULL))
## $fill
## $title
## NULL
## 
## $title.position
## NULL
## 
## $title.theme
## NULL
## 
## $title.hjust
## NULL
## 
## $title.vjust
## NULL
## 
## $label
## [1] TRUE
## 
## $label.position
## NULL
## 
## $label.theme
## NULL
## 
## $label.hjust
## NULL
## 
## $label.vjust
## NULL
## 
## $keywidth
## NULL
## 
## $keyheight
## NULL
## 
## $direction
## NULL
## 
## $override.aes
## list()
## 
## $nrow
## NULL
## 
## $ncol
## NULL
## 
## $byrow
## [1] FALSE
## 
## $reverse
## [1] FALSE
## 
## $order
## [1] 0
## 
## $available_aes
## [1] "any"
## 
## $name
## [1] "legend"
## 
## attr(,"class")
## [1] "guide"  "legend"
## 
## attr(,"class")
## [1] "guides"
ggplot(clinical_all_measures, aes(x = cl, y = extern, fill=cl)) + geom_col() +
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + xlab("Clusters") + ylab("externalizing") + 
  ggtitle("Hydra_k2 externalizing") #+ scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +

  guides(fill=guide_legend(title=NULL))
## $fill
## $title
## NULL
## 
## $title.position
## NULL
## 
## $title.theme
## NULL
## 
## $title.hjust
## NULL
## 
## $title.vjust
## NULL
## 
## $label
## [1] TRUE
## 
## $label.position
## NULL
## 
## $label.theme
## NULL
## 
## $label.hjust
## NULL
## 
## $label.vjust
## NULL
## 
## $keywidth
## NULL
## 
## $keyheight
## NULL
## 
## $direction
## NULL
## 
## $override.aes
## list()
## 
## $nrow
## NULL
## 
## $ncol
## NULL
## 
## $byrow
## [1] FALSE
## 
## $reverse
## [1] FALSE
## 
## $order
## [1] 0
## 
## $available_aes
## [1] "any"
## 
## $name
## [1] "legend"
## 
## attr(,"class")
## [1] "guide"  "legend"
## 
## attr(,"class")
## [1] "guides"
ggplot(clinical_all_measures, aes(x = cl, y = overall, fill=cl)) + geom_col() +
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + xlab("Clusters") + ylab("overall") + 
  ggtitle("Hydra_k2 overall")# + scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +

  guides(fill=guide_legend(title=NULL))
## $fill
## $title
## NULL
## 
## $title.position
## NULL
## 
## $title.theme
## NULL
## 
## $title.hjust
## NULL
## 
## $title.vjust
## NULL
## 
## $label
## [1] TRUE
## 
## $label.position
## NULL
## 
## $label.theme
## NULL
## 
## $label.hjust
## NULL
## 
## $label.vjust
## NULL
## 
## $keywidth
## NULL
## 
## $keyheight
## NULL
## 
## $direction
## NULL
## 
## $override.aes
## list()
## 
## $nrow
## NULL
## 
## $ncol
## NULL
## 
## $byrow
## [1] FALSE
## 
## $reverse
## [1] FALSE
## 
## $order
## [1] 0
## 
## $available_aes
## [1] "any"
## 
## $name
## [1] "legend"
## 
## attr(,"class")
## [1] "guide"  "legend"
## 
## attr(,"class")
## [1] "guides"
#######Chi-square for males/females and race########
##########By males, and by caucasians ##############

#Chi squared sex - significance p = 0.12
#if just want to look at clusters, not TD, look at commented line below, otherwise keep as is
###chisq_matched_sex <- chisq.test(subset_with_clusters_AG_matched$sex, (subset_with_clusters_AG_matched$Hydra_k2 != -1))
chisq_matched_sex <- chisq.test(subset_with_clusters_AG_matched$sex, subset_with_clusters_AG_matched$Hydra_k2)
chisq_matched_sex_pvalue <- chisq_matched_sex$p.value
total_people_Hydra_k2 <- c(length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == -1)]), 
                           length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == 1)]), 
                           length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == 2)]))
                         #  length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == 3)]))

num_men_all_clusters <- c(length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == -1 & subset_with_clusters_AG_matched$sex == 1)]), 
                          length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == 1 & subset_with_clusters_AG_matched$sex == 1)]), 
                          length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == 2 & subset_with_clusters_AG_matched$sex == 1)])) 
                         # length(subset_with_clusters_AG_matched$sex[which(subset_with_clusters_AG_matched$Hydra_k2 == 3 & subset_with_clusters_AG_matched$sex == 1)]))
percent_men_all_clusters <- (num_men_all_clusters/total_people_Hydra_k2) * 100


#Chi square race - p-value = 4.854e-13
chisq_matched_race <- chisq.test(subset_with_clusters_AG_matched$race_binarized, subset_with_clusters_AG_matched$Hydra_k2)
chisq_matched_race_pvalue <- chisq_matched_race$p.value
num_caucasian_all_clusters <- c(length(subset_with_clusters_AG_matched$race_binarized[which(subset_with_clusters_AG_matched$Hydra_k2 == -1 & subset_with_clusters_AG_matched$race_binarized == 1)]), 
                                length(subset_with_clusters_AG_matched$race_binarized[which(subset_with_clusters_AG_matched$Hydra_k2 == 1 & subset_with_clusters_AG_matched$race_binarized == 1)]), 
                                length(subset_with_clusters_AG_matched$race_binarized[which(subset_with_clusters_AG_matched$Hydra_k2 == 2 & subset_with_clusters_AG_matched$race_binarized == 1)]))
                              #  length(subset_with_clusters_AG_matched$race_binarized[which(subset_with_clusters_AG_matched$Hydra_k2 == 3 & subset_with_clusters_AG_matched$race_binarized == 1)]))
percent_caucasian_all_clusters <- (num_caucasian_all_clusters/total_people_Hydra_k2)*100

p_values <- c("", chisq_matched_sex_pvalue, "", chisq_matched_race_pvalue)
dat_sex_race <- data.frame(cl = c("TD", "Cluster1", "Cluster2", "Significance"), 
                           num_males = c(num_men_all_clusters, "---"),
                           percent_males = round(c(percent_men_all_clusters, chisq_matched_sex_pvalue), 2), 
                           num_caucasians = c(num_caucasian_all_clusters, "---"),
                           percent_caucasian = round(c(percent_caucasian_all_clusters, chisq_matched_race_pvalue),2))
dat_sex_race_no_significance <-  data.frame(cl = c("TD", "Cluster1", "Cluster2"), 
                                            num_males = (num_men_all_clusters),
                                            percent_males = percent_men_all_clusters, 
                                            num_caucasians = num_caucasian_all_clusters,
                                            percent_caucasian = percent_caucasian_all_clusters)                 
ggplot(dat_sex_race_no_significance, aes(x = cl, y = percent_males, fill=cl)) + geom_col() +
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + ylim(0, 100) + xlab("Clusters") + ylab("% Male") + 
  ggtitle("Hydra_k2 % Male, p = 0.0154") + scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +
  guides(fill=guide_legend(title=NULL))

ggplot(dat_sex_race_no_significance, aes(x = cl, y = percent_caucasian, fill=cl)) + geom_col() + 
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + ylim(0, 100) + xlab("Clusters") + ylab("% Caucasian") + 
  ggtitle("Hydra_k2 % Caucasian, p = 2.187e-14") + scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) + 
  guides(fill=guide_legend(title=NULL))

#############################
######### Bar Graphs ########
#############################

dat <- data.frame(cluster=c(subset_with_clusters_AG_matched$Hydra_k2), age=c(subset_with_clusters_AG_matched$age_in_years), medu1=c(subset_with_clusters_AG_matched$medu1), race=c(subset_with_clusters_AG_matched$race_binarized), sex_males=c(subset_with_clusters_AG_matched$sex))

#dat_age_sd_sem is the data frame that will be cluster(vertical), columns (age, medu), last rows are standard dec and sem
dat_age_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), 
                             age = c(mean(subset_with_clusters_AG_matched$age_in_years[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]), 
                                     mean(subset_with_clusters_AG_matched$age_in_years[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]), 
                                     mean(subset_with_clusters_AG_matched$age_in_years[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])), 
                             age_sd = c(sd(subset_with_clusters_AG_matched$age_in_years[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]),
                                        sd(subset_with_clusters_AG_matched$age_in_years[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]), 
                                        sd(subset_with_clusters_AG_matched$age_in_years[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)]))) 
                                       
dat_age_sd_sem$sem <- dat_age_sd_sem$age_sd/sqrt(nrow(dat)) 
dat_medu_sd_sem <- data.frame(cl = c("TD", "Cluster1", "Cluster2"), 
                              medu = c(mean(subset_with_clusters_AG_matched$medu1[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]), 
                                       mean(subset_with_clusters_AG_matched$medu1[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]), 
                                       mean(subset_with_clusters_AG_matched$medu1[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)])),
                              age_sd = c(sd(subset_with_clusters_AG_matched$medu1[which(subset_with_clusters_AG_matched$Hydra_k2 ==-1)]), 
                                         sd(subset_with_clusters_AG_matched$medu1[which(subset_with_clusters_AG_matched$Hydra_k2 ==1)]), 
                                         sd(subset_with_clusters_AG_matched$medu1[which(subset_with_clusters_AG_matched$Hydra_k2 ==2)]))) 
                                      
dat_medu_sd_sem$sem <- dat_medu_sd_sem$age_sd/sqrt(nrow(dat))

dat_cont <- data.frame(cluster=c(subset_with_clusters_AG_matched$Hydra_k2), age=c(subset_with_clusters_AG_matched$age_in_years), medu1=c(subset_with_clusters_AG_matched$medu1))
dat_cat <- data.frame(cluster=c(subset_with_clusters_AG_matched$Hydra_k2), race=c(subset_with_clusters_AG_matched$race_binarized), sex=c(subset_with_clusters_AG_matched$sex))
dat.m <- melt(dat, id.vars='cluster')
dat_cont.m <- melt(dat_cont, id.vars='cluster')
dat_cat.m <- melt(dat_cat, id.vars='cluster')

#ggplot(dat.m, aes(fill=cluster, x=cluster, y=value))+ geom_bar(stat="identity") + facet_grid(.~variable) + 
 # labs(x='Clusters_matched',y='') + scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2", "Cluster3")) 

ggplot(dat_age_sd_sem, aes(x = cl, y = age, fill = cl)) + geom_col() + 
  geom_errorbar(aes(ymin=age-sem, ymax=age+sem),width=.2,position=position_dodge(.9)) + 
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + ylim(0, 18) + xlab("Clusters") + ylab("Age in Years") + 
  ggtitle("Age by Cluster") + scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +
  guides(fill=guide_legend(title=NULL)) 

ggplot(dat_medu_sd_sem, aes(x = cl, y = medu, fill = cl)) + geom_col() + 
  geom_errorbar(aes(ymin=medu-sem, ymax=medu+sem),width=.2,position=position_dodge(.9)) + 
  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2")) + ylim(0, 18) + xlab("Clusters") + ylab("Maternal Ed in Years") + 
  ggtitle("Maternal Edu by Cluster") + scale_fill_discrete(breaks=c("TD", "Cluster1", "Cluster2")) +
  guides(fill=guide_legend(title=NULL)) 

#ggplot(dat_cont.m, aes(fill=cluster, x=cluster, y=value)) + geom_bar(stat='identity') + 
 # facet_grid(.~variable) + labs(x='Clusters_matched',y='') + 
  #scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2"))

#ggplot(dat_cat.m, aes(fill=cluster, x=cluster, y=value)) + geom_bar(stat='identity') + 
 # facet_grid(.~variable) + labs(x='Clusters_matched',y='') +
#  scale_x_discrete(limits=c("TD", "Cluster1", "Cluster2", "Cluster3"))

FDR correction

###################################################
## Extracting anovas with significant p values ####
###################################################

#Looking at Hydra_k2, but can redo with nested list
#Look at model summaries
models_anova <- lapply(clinical_cog_score_cluster_stats_anova_AG_matched, summary)

#Pull p-values
p_anova <- sapply(clinical_cog_score_cluster_stats_anova_AG_matched, function(v) v$"Pr(>F)"[1]) #$coef[,"Pr(>F)"][2]) #get the p value for dep binarized

#Convert to data frame
p_anova <- as.data.frame(p_anova)

#Print original p-values to three decimal places
p_round_anova <- round(p_anova,3)

#FDR correct p-values
pfdr_anova <- p.adjust(p_anova[,1],method="fdr")

#Convert to data frame
pfdr_anova <- as.data.frame(pfdr_anova)
row.names(pfdr_anova) <- clinical_measure_names

#To print fdr-corrected p-values to three decimal places
pfdr_round_anova <- round(pfdr_anova,3)

#List the NMF components that survive FDR correction
clinical_fdr_anova <- row.names(pfdr_anova)[pfdr_anova<0.05]

#make a data frame with names and fdr values (rounded to 3 decimals)
clinical_names_and_fdr_values_anova <- data.frame(cbind(clinical_fdr_anova, round(pfdr_anova[pfdr_anova<0.05],3)))

#add titles to names_and_fdr tables
names(clinical_names_and_fdr_values_anova) <- c("clinical_measure", "p_FDR_corr")

#write the results of the mass univariate stats to files
write.csv(clinical_names_and_fdr_values_anova, file = "/Users/eballer/BBL/from_chead/ballerDepHeterogen/results/Hydra_k2_clinical_20180315.csv")

print(clinical_names_and_fdr_values_anova)
##                    clinical_measure p_FDR_corr
## 1                    mood_4factorv2          0
## 2               psychosis_4factorv2       0.02
## 3           externalizing_4factorv2          0
## 4                 phobias_4factorv2          0
## 5 overall_psychopathology_4factorv2          0

Demographics for matched, unmatched and unmatched_resid group by cluster

#############################
####### Demographics ########
#############################

#######Matched group all clusters ##############
#subset demographics
listVars <- c("Race", "Sex", "Maternal Ed", "Age", "Depression", "Cluster") #Race 1 = caucasian, Maternal Ed = years, age = years, dep 1 = dep, 0 = non_dep
demo_Hydra_k2_AG_matched <- data.frame(subset_with_clusters_AG_matched$race_binarized, subset_with_clusters_AG_matched$sex, subset_with_clusters_AG_matched$medu1, subset_with_clusters_AG_matched$age_in_years, subset_with_clusters_AG_matched$dep_binarized, subset_with_clusters_AG_matched$Hydra_k2)
names(demo_Hydra_k2_AG_matched) <- c(listVars)

#Change categorical values to have names
demo_Hydra_k2_AG_matched$Depression <- ifelse(demo_Hydra_k2_AG_matched$Depression == 1, "Depressed", "Non-depressed")
demo_Hydra_k2_AG_matched$Race <- ifelse(demo_Hydra_k2_AG_matched$Race == 1, "Caucasian", "Non-caucasian")
demo_Hydra_k2_AG_matched$Sex <- ifelse(demo_Hydra_k2_AG_matched$Sex == 1, "Male", "Female")

#Define Categorical Variables
cat_variables <- c("Race", "Depression", "Sex", "Cluster")
title <- c("Hydra_k2 demographics")

#create demographics table
demo_Hydra_k2_AG_matched_table <- CreateTableOne(vars = listVars, data = demo_Hydra_k2_AG_matched, factorVars = cat_variables, strata = c("Cluster"))
print(demo_Hydra_k2_AG_matched_table, showAllLevels = TRUE)
##                          Stratified by Cluster
##                           level         -1             1             
##   n                                       711            376         
##   Race (%)                Caucasian       393 ( 55.3)    155 ( 41.2) 
##                           Non-caucasian   318 ( 44.7)    221 ( 58.8) 
##   Sex (%)                 Female          476 ( 66.9)    270 ( 71.8) 
##                           Male            235 ( 33.1)    106 ( 28.2) 
##   Maternal Ed (mean (sd))               14.14 (2.26)   13.75 (2.23)  
##   Age (mean (sd))                       16.11 (3.00)   15.66 (3.14)  
##   Depression (%)          Depressed         0 (  0.0)    376 (100.0) 
##                           Non-depressed   711 (100.0)      0 (  0.0) 
##   Cluster (%)             -1              711 (100.0)      0 (  0.0) 
##                           1                 0 (  0.0)    376 (100.0) 
##                           2                 0 (  0.0)      0 (  0.0) 
##                          Stratified by Cluster
##                           2              p      test
##   n                         336                     
##   Race (%)                  238 ( 70.8)  <0.001     
##                              98 ( 29.2)             
##   Sex (%)                   207 ( 61.6)   0.015     
##                             129 ( 38.4)             
##   Maternal Ed (mean (sd)) 14.53 (2.29)   <0.001     
##   Age (mean (sd))         16.66 (2.49)   <0.001     
##   Depression (%)            336 (100.0)  <0.001     
##                               0 (  0.0)             
##   Cluster (%)                 0 (  0.0)  <0.001     
##                               0 (  0.0)             
##                             336 (100.0)
#######Unmatched group ##############
#subset demographics
listVars <- c("Race", "Sex", "Maternal Ed", "Age", "Depression", "Cluster") #Race 1 = caucasian, Maternal Ed = years, age = years, dep 1 = dep, 0 = non_dep
demo_Hydra_k2_AG_unmatched <- data.frame(subset_with_clusters_AG_unmatched$race_binarized, subset_with_clusters_AG_unmatched$sex, subset_with_clusters_AG_unmatched$medu1, subset_with_clusters_AG_unmatched$age_in_years, subset_with_clusters_AG_unmatched$dep_binarized, subset_with_clusters_AG_unmatched$Hydra_k2)
names(demo_Hydra_k2_AG_unmatched) <- c(listVars)

#Change categorical values to have names
demo_Hydra_k2_AG_unmatched$Depression <- ifelse(demo_Hydra_k2_AG_unmatched$Depression == 1, "Depressed", "Non-depressed")
demo_Hydra_k2_AG_unmatched$Race <- ifelse(demo_Hydra_k2_AG_unmatched$Race == 1, "Caucasian", "Non-caucasian")
demo_Hydra_k2_AG_unmatched$Sex <- ifelse(demo_Hydra_k2_AG_unmatched$Sex == 1, "Male", "Female")

#Define Categorical Variables
cat_variables <- c("Race", "Depression", "Sex", "Cluster")

title <- c("Hydra_k2 demographics")

#create demographics table
demo_Hydra_k2_AG_unmatched_table <- CreateTableOne(vars = listVars, data = demo_Hydra_k2_AG_unmatched, factorVars = cat_variables, strata = c("Cluster"))
print(demo_Hydra_k2_AG_unmatched_table, showAllLevels = TRUE)
##                          Stratified by Cluster
##                           level         -1             1             
##   n                                      2297            376         
##   Race (%)                Caucasian      1469 ( 64.0)    177 ( 47.1) 
##                           Non-caucasian   828 ( 36.0)    199 ( 52.9) 
##   Sex (%)                 Female         1103 ( 48.0)    264 ( 70.2) 
##                           Male           1194 ( 52.0)    112 ( 29.8) 
##   Maternal Ed (mean (sd))               14.93 (2.45)   13.89 (2.24)  
##   Age (mean (sd))                       13.83 (3.72)   16.27 (2.84)  
##   Depression (%)          Depressed         0 (  0.0)    376 (100.0) 
##                           Non-depressed  2297 (100.0)      0 (  0.0) 
##   Cluster (%)             -1             2297 (100.0)      0 (  0.0) 
##                           1                 0 (  0.0)    376 (100.0) 
##                           2                 0 (  0.0)      0 (  0.0) 
##                          Stratified by Cluster
##                           2              p      test
##   n                         341                     
##   Race (%)                  218 ( 63.9)  <0.001     
##                             123 ( 36.1)             
##   Sex (%)                   217 ( 63.6)  <0.001     
##                             124 ( 36.4)             
##   Maternal Ed (mean (sd)) 14.37 (2.31)   <0.001     
##   Age (mean (sd))         15.96 (2.95)   <0.001     
##   Depression (%)            341 (100.0)  <0.001     
##                               0 (  0.0)             
##   Cluster (%)                 0 (  0.0)  <0.001     
##                               0 (  0.0)             
##                             341 (100.0)
#######Ununmatched residuals group ##############
#subset demographics
listVars <- c("Race", "Sex", "Maternal Ed", "Age", "Depression", "Cluster") #Race 1 = caucasian, Maternal Ed = years, age = years, dep 1 = dep, 0 = non_dep
demo_Hydra_k2_AG_resid <- data.frame(subset_with_clusters_AG_resid$race_binarized, subset_with_clusters_AG_resid$sex, subset_with_clusters_AG_resid$medu1, subset_with_clusters_AG_resid$age_in_years, subset_with_clusters_AG_resid$dep_binarized, subset_with_clusters_AG_resid$Hydra_k2)
names(demo_Hydra_k2_AG_resid) <- c(listVars)

#Change categorical values to have names
demo_Hydra_k2_AG_resid$Depression <- ifelse(demo_Hydra_k2_AG_resid$Depression == 1, "Depressed", "Non-depressed")
demo_Hydra_k2_AG_resid$Race <- ifelse(demo_Hydra_k2_AG_resid$Race == 1, "Caucasian", "Non-caucasian")
demo_Hydra_k2_AG_resid$Sex <- ifelse(demo_Hydra_k2_AG_resid$Sex == 1, "Male", "Female")

#make variable list
#table_titles <- c("Cluster 1", "Cluster 2")
#table_titles <- c("Cluster 1", "Depressed", "P-value")

#Define Categorical Variables
cat_variables <- c("Race", "Depression", "Sex", "Cluster")

title <- c("Hydra_k2 demographics")

#create demographics table
demo_Hydra_k2_AG_resid_table <- CreateTableOne(vars = listVars, data = demo_Hydra_k2_AG_resid, factorVars = cat_variables, strata = c("Cluster"))
print(demo_Hydra_k2_AG_resid_table, showAllLevels = TRUE)
##                          Stratified by Cluster
##                           level         -1             1             
##   n                                      2297            346         
##   Race (%)                Caucasian      1469 ( 64.0)    211 ( 61.0) 
##                           Non-caucasian   828 ( 36.0)    135 ( 39.0) 
##   Sex (%)                 Female         1103 ( 48.0)    219 ( 63.3) 
##                           Male           1194 ( 52.0)    127 ( 36.7) 
##   Maternal Ed (mean (sd))               14.93 (2.45)   14.34 (2.31)  
##   Age (mean (sd))                       13.83 (3.72)   16.02 (3.03)  
##   Depression (%)          Depressed         0 (  0.0)    346 (100.0) 
##                           Non-depressed  2297 (100.0)      0 (  0.0) 
##   Cluster (%)             -1             2297 (100.0)      0 (  0.0) 
##                           1                 0 (  0.0)    346 (100.0) 
##                           2                 0 (  0.0)      0 (  0.0) 
##                          Stratified by Cluster
##                           2              p      test
##   n                         371                     
##   Race (%)                  184 ( 49.6)  <0.001     
##                             187 ( 50.4)             
##   Sex (%)                   262 ( 70.6)  <0.001     
##                             109 ( 29.4)             
##   Maternal Ed (mean (sd)) 13.92 (2.25)   <0.001     
##   Age (mean (sd))         16.22 (2.77)   <0.001     
##   Depression (%)            371 (100.0)  <0.001     
##                               0 (  0.0)             
##   Cluster (%)                 0 (  0.0)  <0.001     
##                               0 (  0.0)             
##                             371 (100.0)